本文通过依据不同的数据变量类型,基于ggplot的实现来介绍一些数据可视化基本方法和一些常用的增加可读性的方式。
代码与数据可以在github上找到。
library(ggplot2)
library(dplyr)
library(tidyr)
# 读取数据
players <- read.csv("data/nba_players.csv", stringsAsFactors = FALSE) %>%
mutate(
position = factor(position)
)
teams <- read.csv("data/nba_team.csv", stringsAsFactors = FALSE)
currey_performance <- read.csv("data/nba_curry_performance.csv", stringsAsFactors = FALSE) %>%
mutate(
game_date = as.Date(game_date)
)
my_ggplot_theme <- theme(text = element_text(family = "STKaiti"), # 使用楷体字体显示中文
plot.title = element_text(hjust = 0.5) # 标题居中
)
我们会使用NBA 2016-2017赛季常规赛的球员数据作为可视化的样例数据。
首先,来看一下我们将会使用到样例数据。
knitr::kable(
head(players), booktabs = TRUE
)
| player_id | player_name | avg_pts | avg_ast | avg_oreb | avg_dreb | avg_stl | avg_blk | avg_tov | avg_fgm | avg_fga | avg_tpm | avg_play_time | team_name | position |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 凯尔-科沃尔 | 10.12 | 1.64 | 0.13 | 2.66 | 0.49 | 0.31 | 1.03 | 3.57 | 7.66 | 2.42 | 26.13 | 骑士 | 2.5 |
| 2 | 蒂亚戈-斯普利特 | 4.33 | 0.44 | 0.89 | 1.56 | 0.11 | 0.11 | 0.67 | 1.56 | 3.44 | 0.22 | 8.56 | 76人 | 5 |
| 3 | 保罗-米尔萨普 | 17.87 | 3.63 | 1.59 | 6.13 | 1.30 | 0.87 | 2.29 | 6.17 | 13.94 | 1.09 | 33.66 | 老鹰 | 4 |
| 4 | 萨博-塞福洛沙 | 7.16 | 1.73 | 0.84 | 3.52 | 1.48 | 0.50 | 0.95 | 2.81 | 6.35 | 0.66 | 25.73 | 老鹰 | 2.5 |
| 5 | 杰夫-蒂格 | 15.29 | 7.78 | 0.39 | 3.66 | 1.24 | 0.39 | 2.63 | 4.90 | 11.07 | 1.10 | 32.44 | 步行者 | 1 |
| 7 | 丹尼斯-施罗德 | 17.90 | 6.35 | 0.53 | 2.65 | 0.92 | 0.20 | 3.24 | 6.94 | 15.39 | 1.27 | 31.49 | 老鹰 | 1 |
其中,avg_pts到avg_play_time分别表示球员的平均得分、助攻、前场篮板、后场篮板、抢断、盖帽、失误、命中球数、出手次数、三分命中数、上场时间;position表示球员在场上位置,1.5表示该球员可以打1号位或者2号位。
knitr::kable(
head(teams), booktabs = TRUE
)
| team_id | team_name | division | lon | lat |
|---|---|---|---|---|
| 1 | 老鹰 | 东部 | -84.39215 | 33.75689 |
| 2 | 凯尔特人 | 东部 | -71.06221 | 42.36620 |
| 3 | 鹈鹕 | 西部 | -90.07153 | 29.95107 |
| 4 | 公牛 | 东部 | -87.67418 | 41.88069 |
| 5 | 黄蜂 | 东部 | -80.83934 | 35.22514 |
| 6 | 骑士 | 东部 | -81.68816 | 41.50209 |
lon和lat表示球队的经纬度,由于洛杉矶湖人与快船为一个城市,因此坐标稍作了一些修改。
knitr::kable(
head(currey_performance), booktabs = TRUE
)
| game_date | pts | ast | oreb | dreb | stl | blk | tov | fgm | fga | tpm |
|---|---|---|---|---|---|---|---|---|---|---|
| 2016-10-26 | 26 | 4 | 0 | 3 | 0 | 0 | 4 | 9 | 18 | 3 |
| 2016-10-29 | 23 | 7 | 1 | 0 | 1 | 0 | 4 | 8 | 19 | 4 |
| 2016-10-31 | 28 | 3 | 0 | 1 | 0 | 0 | 1 | 9 | 17 | 5 |
| 2016-11-02 | 28 | 3 | 1 | 3 | 1 | 0 | 4 | 10 | 21 | 5 |
| 2016-11-04 | 21 | 7 | 0 | 1 | 2 | 1 | 1 | 6 | 14 | 2 |
| 2016-11-05 | 13 | 11 | 0 | 8 | 2 | 1 | 2 | 5 | 17 | 0 |
斯蒂芬·库里每场比赛的数据。
我们接下来将会根据不同的变量类型来进行数据可视化。
ggplot(players, aes(x = position)) +
geom_bar() +
labs(x = "球员位置", y = "数量", title = "条形图 —— 各个位置上球员分布") +
my_ggplot_theme
ggplot(players, aes(x = avg_pts)) +
geom_freqpoly() +
labs(x = "得分", y = "数量", title = "频率图 —— 球员得分分布") +
my_ggplot_theme
ggplot(players, aes(x = avg_pts, y = avg_ast)) +
geom_point() +
labs(x = "得分", y = "助攻", title = "散点图 —— 得分与助攻") +
my_ggplot_theme
每个球队在每个位置上的人员分布情况
2.1 以点的大小来展示
ggplot(players, aes(x = team_name, y = position)) +
geom_count() +
labs(x = "球队", y = "位置",
title = "气泡图(大小) —— 每个球队在每个位置上的人员分布") +
my_ggplot_theme +
# 调整x周的标签位置,防止队名重叠
theme(axis.text.x = element_text(angle = 60, hjust = 1))
2.2 以点的颜色来展示
# 每个位置上的球员数量进行统计
team_position_count <- players %>%
count(team_name, position)
ggplot(team_position_count, aes(x = team_name, y = position)) +
geom_point(aes(color = n), size = 4.5) +
labs(x = "球队", y = "位置",
title = "气泡图(颜色) —— 每个球队在每个位置上的人员分布") +
# 调整渐变色
scale_color_gradient(low = "green", high = "red", name = "数量") +
my_ggplot_theme +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
3.1 条形图
展示金州勇士队的球员场均得分qin©˚uan©
gs_player <- filter(players, team_name == "勇士")
# 使用reorder从高到底排列
ggplot(gs_player, aes(x = reorder(player_name, -avg_pts), y = avg_pts)) +
geom_bar(stat = "identity") +
labs(x = "球员", y = "得分", title = "条形图2 —— 勇士队球员场均得分") +
my_ggplot_theme +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
3.2 箱形图
各支球队的球员场均得分分布情况
ggplot(players, aes(x = team_name, y = avg_pts)) +
geom_boxplot() +
labs(x = "球队", y = "得分", title = "箱形图 —— 各支球队的球员场均得分分布") +
my_ggplot_theme +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
库里每场比赛得分情况
ggplot(currey_performance, aes(x = game_date, y = pts)) +
geom_line() +
labs(x = "日期", y = "得分", title = "折线图 —— 库里每场比赛得分") +
my_ggplot_theme
在散点图上展示勇士队球员得分和助攻,用形状来表示位置,颜色来表示上场时间
ggplot(gs_player, aes(x = avg_pts, y = avg_ast)) +
geom_point(aes(shape = position, color = avg_play_time), size = 3) +
labs(x = "得分", y = "助攻", title = "得分、助攻、位置与上场时间") +
my_ggplot_theme +
scale_color_gradient(high = "red", low = "green", name = "上场时间") +
scale_shape_manual(name = "球员位置", values = 1:nlevels(factor(gs_player$position)))
将数据分组进行比较是很常用数据的可视化方式,例如可以数据按照球队、球员位置等分组,进行比较
分别比较勇士、火箭和骑士队的平均得分、助攻、前场篮板、后场篮板、抢断与盖帽数据
player_group <- players %>%
filter(team_name %in% c("勇士", "骑士", "火箭")) %>%
group_by(team_name) %>%
summarise_at(
vars(avg_pts:avg_blk),
funs(
round(mean(.), 2)
)
) %>%
# 转换为长数据
tidyr::gather("item", "value", -team_name) %>%
# 转换为适当的名称
mutate(
item = case_when(item == "avg_pts" ~ "得分",
item == "avg_ast" ~ "助攻",
item == "avg_oreb" ~ "前场篮板",
item == "avg_dreb" ~ "后场篮板",
item == "avg_stl" ~ "抢断",
item == "avg_blk" ~ "盖帽"
),
# 重新排序数据项
item = factor(item, levels = c("得分", "助攻", "前场篮板", "后场篮板", "抢断", "盖帽"))
)
ggplot(player_group, aes(x = item, y = value)) +
geom_bar(aes(fill = team_name), position = "dodge", stat = "identity") +
labs(x = "数据项", y = "数据", title = "勇士、火箭和骑士队的数据比较") +
scale_fill_discrete(name = "队伍") +
my_ggplot_theme
分面按照某个变量进行分组后,分别画出若干图形
3.1 我们观察每个球队的得分、助攻、前场篮板和后场篮板分布情况
team_data <- players %>%
group_by(team_name) %>%
summarise_at(
vars(avg_pts:avg_dreb),
funs(
round(mean(.), 2)
)
) %>%
tidyr::gather("item", "value", -team_name) %>%
mutate(
item = case_when(item == "avg_pts" ~ "得分",
item == "avg_ast" ~ "助攻",
item == "avg_oreb" ~ "前场篮板",
item == "avg_dreb" ~ "后场篮板"
),
item = factor(item, levels = c("得分", "助攻", "前场篮板", "后场篮板"))
)
ggplot(team_data, aes(x = item, y = value)) +
geom_bar(stat = "identity") +
facet_wrap(~ team_name) +
labs(x = "数据项", y = "数据", title = "各球队的数据分布") +
scale_y_continuous(breaks = c(0, 5, 10)) +
my_ggplot_theme +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
3.2 我们比较每个球队的得分、助攻、前场篮板和后场篮板的情况
ggplot(team_data, aes(x = team_name, y = value)) +
geom_bar(stat = "identity") +
facet_grid(item ~ ., scales = "free_y") +
labs(x = "球队", y = "数据", title = "各球队的数据比较") +
my_ggplot_theme +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
通过增加新元素、对坐标轴、图形、数据进行变换等方式,增加可读性
在得分与助攻散点图上,标出斯蒂芬·库里, 勒布朗·詹姆斯, 凯文·杜兰特和詹姆斯·哈登
label_player <- players %>%
filter(player_name %in% c("斯蒂芬-库里", "勒布朗-詹姆斯", "凯文-杜兰特", "詹姆斯-哈登"))
ggplot(players, aes(x = avg_pts, y = avg_ast)) +
geom_point() +
labs(x = "得分", y = "助攻", title = "增加标签") +
geom_point(data = label_player, aes(x = avg_pts, y = avg_ast),
size = 3, shape = 1, color = "red") +
ggrepel::geom_label_repel(data = label_player, aes(x = avg_pts, y = avg_ast, label = player_name),
family = "STKaiti") +
my_ggplot_theme
我们增加地图背景来展示每个球队的平均球员得分情况
library(ggmap)
range_lon <- range(teams$lon)
range_lat <- range(teams$lat)
# 调用ggmap接口下来地图数据
# us_map_range <- c(left = range_lon[1] - 5, right = range_lon[2] + 5,
# bottom = range_lat[1] - 5, top = range_lat[2] + 5)
# us_map <- get_stamenmap(us_map_range, zoom = 5, maptype = "toner-lite")
# 保存地图信息
# save(us_map, file = "us_map.RData")
load("data/us_map.RData")
team_avg_pts <- players %>%
group_by(team_name) %>%
summarise(
avg_pts = round(mean(avg_pts), 2)
) %>%
left_join(teams, by = "team_name")
ggmap(us_map) +
geom_point(data = team_avg_pts, aes(x = lon, y = lat, color = avg_pts), size = 6) +
ggrepel::geom_label_repel(data = team_avg_pts,
aes(x = lon, y = lat, label = paste0(team_name, ":", avg_pts)),
family = "STKaiti") +
scale_color_gradient(high = "red", low = "green", name = "平均球员得分") +
ggtitle("各球队平均球员得分") +
my_ggplot_theme +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank())
查看上场时间与得分关系时,可以添加一条回归线,使得关系更为清晰
fit_lm <- lm(avg_pts ~ avg_play_time, data = players)
lm_formula <- paste0("points = ", round(coef(fit_lm)[1], 1), " + ",
round(coef(fit_lm)[2], 1), " * play_time")
ggplot(players, aes(x = avg_play_time, y = avg_pts)) +
geom_point(alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
geom_text(aes(x = 27, y = 12, label = lm_formula), hjust = 0, color = "blue") +
labs(x = "上场时间", y = "得分", title = "上场时间与得分关系") +
my_ggplot_theme
我们通过旋转x轴,用雷达图展示库里和杜兰特的部分数据的比较
players_cd <- players %>%
select(player_name, avg_pts:avg_blk) %>%
mutate_at(
vars(-player_name),
funs(
as.numeric(scales::rescale(.))
)
) %>%
filter(player_name %in% c("斯蒂芬-库里", "凯文-杜兰特"))
names(players_cd) <- c("球员", "得分", "助攻", "进攻篮板", "防守篮板", "抢断", "盖帽")
ggradar::ggradar(players_cd, font.radar = "STKaiti", axis.label.size = 3)
通过条形图的位置变化,以漏斗图来展示勇士队球员的平均得分
fuel_gs_player <- data.frame(player_name = gs_player$player_name,
label = paste0(gs_player$player_name, ":", gs_player$avg_pts),
help_bar = (max(gs_player$avg_pts) - gs_player$avg_pts) / 2,
avg_pts = gs_player$avg_pts)
fuel_gs_player$player_name <- reorder(fuel_gs_player$player_name, -fuel_gs_player$avg_pts)
fuel_gs_player_tmp <- fuel_gs_player %>%
gather(perform, avg_pts, help_bar, avg_pts) %>%
mutate(
perform = factor(perform, level = c("avg_pts", "help_bar"), order = TRUE)
)
ggplot() +
geom_bar(data = fuel_gs_player_tmp, aes(x = as.integer(player_name), y = avg_pts, fill = perform),
stat = "identity", position = "stack") +
scale_fill_manual(values = c("steelblue", "white")) +
geom_text(data = fuel_gs_player, aes(x = as.integer(player_name), y = help_bar + avg_pts / 2 - 0.05, label = label),
col = "black", size = 3, family = "STKaiti") +
ggtitle("勇士球员得分") +
my_ggplot_theme + guides(fill = "none") +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank()) +
coord_flip() + scale_x_reverse()
上场时间与得分存在较为明显的非线性关系,我们可以对得分进行对数变换,使得线性更为明显。
log_pts_data <- players %>%
select(avg_pts, avg_play_time) %>%
filter(avg_pts > 0) %>%
mutate(
log_avg_pts = log(avg_pts)
)
fit_log_lm <- lm(log_avg_pts ~ avg_play_time, data = log_pts_data)
log_lm_formula <- paste0("log(points) = ", round(coef(fit_log_lm)[1], 1), " + ",
round(coef(fit_log_lm)[2], 1), " * play_time")
ggplot(log_pts_data, aes(x = avg_play_time, y = log_avg_pts)) +
geom_point(alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
geom_text(aes(x = 35, y = 3.5, label = log_lm_formula), hjust = 1, color = "blue") +
labs(x = "上场时间", y = "对数变换后的得分", title = "上场时间与得分关系") +
my_ggplot_theme
利用ggthemr包中的主题,我们可以对上图增加“dust”主题风格
ggplot(log_pts_data, aes(x = avg_play_time, y = log_avg_pts)) +
geom_point(alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
geom_text(aes(x = 35, y = 3.5, label = log_lm_formula), hjust = 1, color = "blue") +
labs(x = "上场时间", y = "对数变换后的得分", title = "上场时间与得分关系") +
ggthemr::ggthemr("dust", layout = "scientific", type = 'outer')$theme +
theme(text = element_text(family = "STKaiti"))
增加图形可读性的方式还有许多,本文只列举了非常小的一分部作为参考。